Problem


As can be seen in 20_04_08_Ridge.html, there is a relation between the score the model assigns to a gene and the gene’s mean level of expression. This is a problem because we had previously discovered a bias in the SFARI scores related to mean level of expression (Preprocessing/Gandal/AllRegions/RMarkdowns/20_04_03_SFARI_genes.html), which means that this could be a confounding factor in our model and the reason why it seems to perform well. We therefore need to remove this bias to recover the true biological signal that is mixed with it and to improve the quality of our model.


Post-Processing Approach


After the model has been trained with the bias, perform a post-processing of the classifier outputs.

Since the effect of the bias is proportional to the mean level of expression of a gene, we can correct it by removing the effect of the mean expression from the probability of the model.

Problems:

library(tidyverse)
library(knitr)
library(plotly) ; library(viridis) ; library(gridExtra) ; library(RColorBrewer)
library(biomaRt)
library(mgcv)
library(Rtsne)
library(knitr)
library(ROCR)
library(expss)
library(caret)

# Map SFARI gene scores (positions 1-6) and auxiliary categories (7-10) to the
# fixed colour palette used throughout the report.
#
# r: integer vector of palette positions (1-10); positions outside that range
#    yield NA, matching base R subsetting behaviour.
# Returns a character vector of colour codes the same length as r.
SFARI_colour_hue = function(r) {
  pal = c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9',
          '#b3b3b3','#808080','gray','#d9d9d9')
  # Return the selection explicitly instead of relying on the (invisible)
  # value of an assignment, as the original did
  pal[r]
}

Load data

# Clusterings: module assignment per gene, taken from the column of
# clusters.csv named by clustering_selected
clustering_selected = 'DynamicHybridMergedSmall'
clusterings = read_csv('./../Data/clusters.csv')
# Flatten the single selected column into a plain unnamed vector
clusterings$Module = clusterings[,clustering_selected] %>% data.frame %>% unlist %>% unname

assigned_module = data.frame('ID' = clusterings$ID, 'Module' = clusterings$Module)

# Regression data
# NOTE(review): presumably brings `dataset` and `predictions` (both used
# below) into the workspace — confirm the contents of the RData file
load(file='./../Data/Ridge_model_robust.RData')

# Mean Expression data
# NOTE(review): presumably brings datExpr, DE_info, dds, datGenes and datMeta
# into the workspace — confirm
load('./../Data/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
DE_info = DE_info %>% data.frame

# Dataset created with DynamicTreeMerged algorithm
clustering_selected = 'DynamicHybridMergedSmall'
original_dataset = read.csv(paste0('./../Data/dataset_', clustering_selected, '.csv'), row.names=1)

# Add gene symbol: query the feb2014 Ensembl archive for the symbol of every
# gene in the model's dataset
getinfo = c('ensembl_gene_id','external_gene_id')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL', dataset='hsapiens_gene_ensembl',
               host='feb2014.archive.ensembl.org')
gene_names = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=rownames(dataset), mart=mart)

# The original model's (uncorrected) test-set predictions; corrected below
test_set = predictions

# Drop objects that are no longer needed
rm(dds, datGenes, datMeta, clustering_selected, clusterings, mart)


Remove Bias


The relation between level of expression and probability assigned by the model could be modelled by a linear regression, but we would lose some of the behaviour. Fitting a curve using Generalised Additive Models seems to capture the relation in a much better way, with an \(R^2\) twice as large and no recognisable pattern in the residuals of the regression.

# Attach each gene's mean expression to its model prediction (right_join keeps
# only genes present in the test set)
plot_data = data.frame('ID'=rownames(datExpr), 'meanExpr'=rowMeans(datExpr)) %>% 
            right_join(test_set, by='ID')

# Fit linear and GAM models to data
lm_fit = lm(prob ~ meanExpr, data = plot_data)
gam_fit = gam(prob ~ s(meanExpr), method = 'REML', data = plot_data)

# NOTE(review): assumes neither fit dropped rows (e.g. NA meanExpr); otherwise
# the residual vectors would misalign with plot_data — confirm
plot_data = plot_data %>% mutate(lm_res = lm_fit$residuals, gam_res = gam_fit$residuals)

# Plot data: fitted values (top row) and residuals (bottom row) for each model
p1 = plot_data %>% ggplot(aes(meanExpr, prob)) + geom_point(alpha=0.1, color='#0099cc') + geom_smooth(method='lm', color='gray', alpha=0.3) +
     xlab('Mean Expression') + ylab('Probability') + ggtitle('Linear Fit') + theme_minimal()

p2 = plot_data %>% ggplot(aes(meanExpr, prob)) + geom_point(alpha=0.1, color='#0099cc') + geom_smooth(method='gam', color='gray', alpha=0.3) +
     xlab('Mean Expression') + ylab('Probability') + ggtitle('GAM fit') + theme_minimal()

p3 = plot_data %>% ggplot(aes(meanExpr, lm_res)) + geom_point(alpha=0.1, color='#ff9900') + 
     geom_smooth(method='gam', color='gray', alpha=0.3) + xlab('Mean Expression') +
     ylab('Residuals') + theme_minimal() + ggtitle(bquote(paste(R^{2},' = ', .(round(summary(lm_fit)$r.squared, 4)))))

p4 = plot_data %>% ggplot(aes(meanExpr, gam_res)) + geom_point(alpha=0.1, color='#ff9900') + 
     geom_smooth(method='gam', color='gray', alpha=0.3) + xlab('Mean Expression') +
     ylab('Residuals') + theme_minimal() + ggtitle(bquote(paste(R^{2},' = ', .(round(summary(gam_fit)$r.sq, 4)))))


grid.arrange(p1, p2, p3, p4, nrow = 2)

# gam_fit is kept: its residuals become the corrected scores below
rm(p1, p2, p3, p4, lm_fit)


Remove bias from scores with GAM fit


  • Assigning the residuals of the GAM model as the new model probability

  • Adding the mean probability of the original model to each new probability so our new probabilities have the same mean as the original ones

  • As with the plot above, the relation between mean expression and the probability assigned by the model is gone

# Correct Bias: the new score is the GAM residual (mean-expression effect
# removed) shifted by the mean of the original probabilities, so corrected
# scores keep the original mean. Note the result is no longer guaranteed to
# lie in [0, 1].
test_set$corrected_score = gam_fit$residuals + mean(test_set$prob)

# Plot results: corrected score vs mean expression — the trend should be flat
plot_data = data.frame('ID'=rownames(datExpr), 'meanExpr'=rowMeans(datExpr)) %>% 
            right_join(test_set, by='ID')

plot_data %>% ggplot(aes(meanExpr, corrected_score)) + geom_point(alpha=0.1, color='#0099cc') + 
              geom_smooth(method='gam', color='gray', alpha=0.3) + ylab('Corrected Score') + xlab('Mean Expression') +
              theme_minimal() + ggtitle('Mean expression vs Model score corrected using GAM')




Robust Accuracy, AUC and ROC predictions

### DEFINE FUNCTIONS

# Create positive training set including all SFARI scores
# Build the positive (SFARI) training indices, sampling a fraction p of the
# genes within each SFARI score (1-6) so every score is represented in the
# training set in the same proportion.
#
# p:    fraction of genes of each score to sample (0-1).
# seed: RNG seed, so repeated calls with the same seed are reproducible.
# Returns an integer vector of row indices into `dataset` (over-sampling with
# replacement happens later, in create_train_test_sets).
# NOTE: relies on the globals `dataset` and `original_dataset`.
positive_sample_balancing_SFARI_scores = function(p, seed){
  
  set.seed(seed)
  
  sample_score_idx = function(score){
    score_genes = rownames(original_dataset)[rownames(original_dataset) %in% rownames(dataset) & original_dataset$gene.score == score]
    score_idx = which(rownames(dataset) %in% score_genes)
    # sample.int + subsetting avoids the sample() scalar footgun: if a score
    # had a single gene, sample(score_idx, ...) would draw from 1:score_idx
    # instead of returning that one index
    score_idx[sample.int(length(score_idx), size = ceiling(p*length(score_idx)))]
  }
  
  # Build the index vector functionally instead of growing it with c() in a loop
  unlist(lapply(1:6, sample_score_idx))
}

# Assemble balanced train/test sets from the global `dataset`.
#
# p:                  fraction of SFARI genes of each score placed in training.
# over_sampling_fold: extra positive draws (with replacement), as a multiple
#                     of the positive-set size, to inflate the minority class.
# seed:               RNG seed, forwarded to the positive-set sampler (which
#                     calls set.seed); the negative sampling continues the
#                     same RNG stream, so the whole split is reproducible.
# Returns list(train_set, test_set): train may contain duplicated rows (the
# over-sampled positives); test holds every row not used for training.
create_train_test_sets = function(p, over_sampling_fold, seed){
  
  ### CREATE POSITIVE TRAINING SET (balancing SFARI scores and over-sampling)
  positive_train_idx = positive_sample_balancing_SFARI_scores(p, seed)
  # Index-based sampling avoids sample()'s scalar footgun (a length-1 vector
  # would be read as 1:n) if the positive set ever collapses to one gene;
  # for length > 1 the RNG draws are identical to sample()
  add_obs = positive_train_idx[sample.int(length(positive_train_idx),
                                          size = ceiling(over_sampling_fold*length(positive_train_idx)),
                                          replace = TRUE)]
  positive_train_idx = c(positive_train_idx, add_obs)
  
  
  ### CREATE NEGATIVE TRAINING SET (same size as the over-sampled positive set)
  negative_idx = which(!dataset$SFARI)
  negative_train_idx = negative_idx[sample.int(length(negative_idx),
                                               size = length(positive_train_idx))]
  
  
  ### CREATE TRAIN AND TEST SET (test = everything never seen in training)
  train_set = dataset[sort(c(positive_train_idx, negative_train_idx)),]
  test_set = dataset[-unique(c(positive_train_idx, negative_train_idx)),]
  
  return(list('train_set' = train_set, 'test_set' = test_set))
  
}

# Run one full train/evaluate iteration: split the data, train a ridge
# classifier, remove the mean-expression bias from its probabilities with a
# GAM, and measure accuracy and AUC on the test set.
#
# p:                  fraction of SFARI genes used for training
# over_sampling_fold: over-sampling factor for the positive class
# seed:               seed controlling the train/test split
# Returns list('acc', 'AUC', 'preds'), where preds has columns ID, prob
# (bias-corrected) and pred.
# NOTE: relies on the globals `dataset` and `datExpr`.
run_model = function(p, over_sampling_fold, seed){
  
  # Create train and test sets
  train_test_sets = create_train_test_sets(p, over_sampling_fold, seed)
  train_set = train_test_sets[['train_set']]
  test_set = train_test_sets[['test_set']]
  
  # Train Model: ridge regression (glmnet, alpha = 0), lambda chosen by
  # 10-fold cross-validation over lambda_seq
  train_set$SFARI = train_set$SFARI %>% as.factor
  lambda_seq = 10^seq(1, -4, by = -.1)
  set.seed(123)  # fixed so the CV folds are the same in every iteration
  fit = train(SFARI ~., data = train_set, method = 'glmnet', trControl = trainControl('cv', number = 10),
                tuneGrid = expand.grid(alpha = 0, lambda = lambda_seq))
  
  # Predict labels in test set ($`TRUE` is the probability column named after
  # the positive factor level)
  predictions = fit %>% predict(test_set, type='prob')
  preds = data.frame('ID'=rownames(test_set), 'prob'=predictions$`TRUE`) %>% mutate(pred = prob>0.5)
  
  # Correct Bias: regress prob on mean expression with a GAM and keep the
  # residuals, shifted so the corrected probabilities keep the original mean.
  # NOTE(review): assumes the GAM kept every row of bias_data (no NA meanExpr);
  # otherwise gam_fit$residuals would misalign with preds — confirm
  bias_data = data.frame('ID'=rownames(datExpr), 'meanExpr'=rowMeans(datExpr)) %>% right_join(preds, by='ID')
  gam_fit = gam(prob ~ s(meanExpr), method = 'REML', data = bias_data)
  preds$prob = gam_fit$residuals + mean(preds$prob)
  preds$pred = preds$prob>0.5

  # Measure performance of the model (accuracy and ROC AUC on the test set)
  acc = mean(test_set$SFARI==preds$pred)
  pred_ROCR = prediction(preds$prob, test_set$SFARI)
  AUC = performance(pred_ROCR, measure='auc')@y.values[[1]]
  
  
  return(list('acc' = acc, 'AUC' = AUC, 'preds' = preds))
}


### RUN MODEL

# Parameters
p = 0.8                     # fraction of SFARI genes used for training
over_sampling_fold = 3      # over-sampling factor for the positive class
n_iter = 100                # number of train/test iterations
seeds = 123:(123+n_iter-1)  # one seed per iteration, for reproducibility

# Store outputs: `predictions` accumulates, per gene, the sum of corrected
# probabilities and positive predictions plus the number of times the gene
# landed in a test set (n); averaged after the loop
acc = c()
AUC = c()
predictions = data.frame('ID' = rownames(dataset), 'SFARI' = dataset$SFARI, 'prob' = 0, 'pred' = 0, 'n' = 0)

for(seed in seeds){
  
  # Run model
  model_output = run_model(p, over_sampling_fold, seed)
  
  # Update outputs
  acc = c(acc, model_output[['acc']])
  AUC = c(AUC, model_output[['AUC']])
  preds = model_output[['preds']]
  update_preds = preds %>% dplyr::select(-ID) %>% mutate(n=1)
  # Element-wise addition: relies on preds having the same gene order as the
  # matching rows of predictions (both follow rownames(dataset))
  predictions[predictions$ID %in% preds$ID, c('prob','pred','n')] = predictions[predictions$ID %in% preds$ID, c('prob','pred','n')] + update_preds
}

# Average the accumulated probabilities and derive the final class prediction;
# pred_count keeps how many iterations predicted each gene positive
predictions = predictions %>% mutate(prob = prob/n, pred_count = pred, pred = prob>0.5)

rm(p, over_sampling_fold, seeds, update_preds, positive_sample_balancing_SFARI_scores, create_train_test_sets, run_model)




Performance Metrics


Confusion matrix

# Confusion matrix of the test-set predictions against the true SFARI labels
conf_mat = test_set %>% mutate(corrected_pred = corrected_score>0.5) %>%
                        apply_labels(SFARI = 'Actual Labels', 
                                     corrected_score = 'Corrected Score', 
                                     corrected_pred = 'Corrected Label Prediction')

# NOTE(review): the cross-tabulation below uses the ORIGINAL predictions
# (conf_mat$pred), not the corrected_pred computed just above — confirm which
# was intended for this section
cro(conf_mat$SFARI, list(conf_mat$pred, total()))
 FALSE   TRUE   #Total 
 Actual Labels 
   FALSE  9730 5407 15137
   TRUE  358 539 897
   #Total cases  10088 5946 16034
rm(conf_mat)

Accuracy


The accuracy was expected to decrease because the bias was helping classify samples correctly, but for the wrong reasons. Although I was expecting the accuracy to change more.

# Mean and SD of the accuracy across the bias-corrected iterations
cat(paste0('ACCURACY: mean = ', round(mean(acc),4), ' SD = ', round(sd(acc),4)))
## ACCURACY: mean = 0.6359 SD = 0.0071
# Accuracy of the original (uncorrected) model on the same test set, for
# comparison against the corrected mean accuracy
old_acc = mean(test_set$SFARI==(test_set$prob>0.5))
acc = mean(acc)
cat(paste0('Accuracy decreased ',round(old_acc-acc,4), ' points'))
## Accuracy decreased 0.0046 points
rm(old_acc)

ROC Curve


AUC decreased 0.04

# Mean and SD of the AUC across iterations
cat(paste0('AUC:      mean = ', mean(AUC), ' SD = ', sd(AUC)))
## AUC:      mean = 0.630819626188466 SD = 0.0199744636907748
# ROC curve of the corrected scores on the aggregated test set
pred_ROCR = prediction(test_set$corrected_score, test_set$SFARI)

roc_ROCR = performance(pred_ROCR, measure='tpr', x.measure='fpr')
auc = performance(pred_ROCR, measure='auc')@y.values[[1]]

plot(roc_ROCR, main=paste0('ROC curve (AUC=',round(auc,2),')'), col='#009999')
abline(a=0, b=1, col='#666666')  # diagonal = random classifier baseline

# pred_ROCR is kept for the lift curve below
rm(roc_ROCR, auc)

Lift Curve


Lift decreased from a starting point of around 10 to just 4. The genes most affected seem to have been the ones with the highest scores

# Lift curve (lift vs rate of positive predictions) for the corrected scores;
# pred_ROCR comes from the ROC chunk above
lift_ROCR = performance(pred_ROCR, measure='lift', x.measure='rpp')
plot(lift_ROCR, main='Lift curve', col='#86b300')

rm(lift_ROCR, pred_ROCR)

Analyse Model

Looks very similar to before, the means of each group are a bit closer together

# Distribution of the corrected scores by true SFARI label, with dashed lines
# at each group's mean
plot_data = test_set %>% dplyr::select(corrected_score, SFARI)

ggplotly(plot_data %>% ggplot(aes(corrected_score, fill=SFARI, color=SFARI)) + geom_density(alpha=0.3) + xlab('Score') +
         geom_vline(xintercept = mean(plot_data$corrected_score[plot_data$SFARI]), color = '#00C0C2', linetype = 'dashed') +
         geom_vline(xintercept = mean(plot_data$corrected_score[!plot_data$SFARI]), color = '#FF7371', linetype = 'dashed') +
         theme_minimal() + ggtitle('Model score distribution by SFARI Label'))

The positive relation between SFARI scores and Model scores is still there but is not as strong as before

# Corrected score vs SFARI gene score (1 = strongest evidence, None = no label)
plot_data = test_set %>% mutate(ID=rownames(test_set)) %>% dplyr::select(ID, corrected_score) %>%
            left_join(original_dataset %>% mutate(ID=rownames(original_dataset)), by='ID') %>%
            dplyr::select(ID, corrected_score, gene.score) %>% apply_labels(gene.score='SFARI Gene score')

# Gene counts per SFARI score in the test set
cro(plot_data$gene.score)
 #Total 
 SFARI Gene score 
   1  25
   2  64
   3  191
   4  432
   5  163
   6  22
   None  15137
   #Total cases  16034
ggplotly(plot_data %>% ggplot(aes(gene.score, corrected_score, fill=gene.score)) + geom_boxplot() + 
              scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + 
              ggtitle('Distribution of the Model scores by SFARI score') +
              xlab('SFARI score') + ylab('Model score') + theme_minimal())

Print genes with highest corrected scores in test set

# Top 50 genes by corrected score, annotated with gene symbol, module colour,
# module-diagnosis correlation and SFARI score
test_set %>% dplyr::select(corrected_score, SFARI) %>% mutate(ID = rownames(test_set)) %>% 
             arrange(desc(corrected_score)) %>% top_n(50, wt=corrected_score) %>%
             left_join(original_dataset %>% mutate(ID=rownames(original_dataset)), by='ID')  %>% 
             left_join(gene_names, by = c('ID'='ensembl_gene_id')) %>%
             dplyr::rename('GeneSymbol' = external_gene_id, 'Probability' = corrected_score,
                           'ModuleDiagnosis_corr' = MTcor, 'GeneSignificance' = GS) %>%
             mutate(ModuleDiagnosis_corr = round(ModuleDiagnosis_corr,4), Probability = round(Probability,4), 
                    GeneSignificance = round(GeneSignificance,4)) %>%
             dplyr::select(GeneSymbol, GeneSignificance, ModuleDiagnosis_corr, Module, Probability, gene.score) %>%
             kable(caption = 'Genes with highest model probabilities from the test set')
Genes with highest model probabilities from the test set
GeneSymbol GeneSignificance ModuleDiagnosis_corr Module Probability gene.score
HUNK -0.3273 -0.6750 #D376FF 0.8422 None
FMNL1 -0.2223 -0.6031 #00BA38 0.8403 None
ETV6 -0.1418 -0.6031 #00BA38 0.8259 None
ZNF804A -0.2768 -0.6750 #D376FF 0.8248 3
TLE3 0.4884 0.1127 #FF62BC 0.8224 None
POM121C -0.2791 -0.6031 #00BA38 0.8210 None
SAP130 -0.2482 -0.6031 #00BA38 0.8179 None
PCDHB13 -0.0396 -0.4946 #B79F00 0.8178 None
AMPD3 -0.2810 -0.9514 #00C0AF 0.8130 None
SETD1B -0.1561 -0.6031 #00BA38 0.8110 4
RPRD2 -0.0922 -0.6031 #00BA38 0.8097 None
RFX7 0.1372 0.0586 #FE6E8A 0.8074 None
BCL9 0.1299 -0.6031 #00BA38 0.8072 None
SCAF4 0.0185 -0.6031 #00BA38 0.8044 None
PLAGL2 0.3941 0.1127 #FF62BC 0.8036 None
DROSHA -0.4111 -0.6031 #00BA38 0.7989 None
ITGAX -0.0073 -0.0094 #00A7FF 0.7978 None
CACNG3 -0.4689 -0.6031 #00BA38 0.7962 None
FOXP4 -0.2102 -0.6031 #00BA38 0.7929 None
FOXJ2 0.2507 0.1127 #FF62BC 0.7929 None
MIDN -0.0520 -0.6031 #00BA38 0.7924 None
C20orf112 0.0314 0.1127 #FF62BC 0.7913 None
RNF111 -0.2410 0.0586 #FE6E8A 0.7894 None
SBK1 -0.3553 -0.6031 #00BA38 0.7882 None
CCRN4L -0.0285 -0.0094 #00A7FF 0.7849 None
AQP1 -0.0924 -0.9514 #00C0AF 0.7840 None
TRIM67 -0.0556 0.1127 #FF62BC 0.7832 None
CACNG2 -0.6778 -0.6031 #00BA38 0.7827 None
SAMSN1 0.4229 0.7287 #39B600 0.7819 None
ZFPM2 0.2342 0.1127 #FF62BC 0.7817 None
IGF1 -0.2511 -0.6750 #D376FF 0.7807 None
PARVG 0.5666 0.7287 #39B600 0.7797 None
CHAMP1 -0.4093 -0.6031 #00BA38 0.7795 None
SERPING1 0.2991 0.8742 #E58700 0.7778 None
UBLCP1 -0.4921 -0.9514 #00C0AF 0.7768 None
EP300 -0.0234 0.1127 #FF62BC 0.7767 4
MBD5 0.1943 0.0586 #FE6E8A 0.7763 3
SH3KBP1 0.0117 0.0586 #FE6E8A 0.7759 5
TCF7L2 0.1338 -0.6031 #00BA38 0.7758 3
SRCAP -0.3623 -0.6031 #00BA38 0.7757 2
ATF7 0.1072 -0.6031 #00BA38 0.7752 None
FAM193A 0.0355 0.0586 #FE6E8A 0.7750 None
PIEZO2 -0.0784 -0.6750 #D376FF 0.7729 None
TBC1D8 0.0969 0.7287 #39B600 0.7716 None
ECD -0.0952 -0.0094 #00A7FF 0.7713 None
AGO2 -0.1465 -0.6031 #00BA38 0.7692 None
R3HDM2 -0.4078 -0.6031 #00BA38 0.7689 None
DAAM1 0.1579 -0.0094 #00A7FF 0.7682 None
SIRPB1 0.2253 0.1127 #FF62BC 0.7677 None
CDC42EP3 -0.3484 -0.9514 #00C0AF 0.7677 None



Negative samples distribution


  • The genes with the highest scores were affected the most as a group

  • More genes got their score increased than decreased but on average, the ones that got it decreased had a bigger change

# Genes without a SFARI label (the negative class)
negative_set = test_set %>% filter(!SFARI)

# Original vs corrected probability per gene; colour encodes the size of the
# change, dashed diagonal = no change
negative_set %>% mutate(diff = abs(prob-corrected_score)) %>% 
             ggplot(aes(prob, corrected_score, color = diff)) + geom_point(alpha=0.2) + scale_color_viridis() + 
             geom_abline(slope=1, intercept=0, color='gray', linetype='dashed') + 
             geom_smooth(color='#666666', alpha=0.5, se=TRUE, size=0.5) + coord_fixed() +
             xlab('Original probability') + ylab('Corrected probability') + theme_minimal() + theme(legend.position = 'none')

# Cross-tabulate original vs corrected class predictions for the negative set
negative_set_table = negative_set %>% mutate(corrected_pred = corrected_score>0.5) %>%
                     apply_labels(corrected_score = 'Corrected Probability', 
                                  corrected_pred = 'Corrected Class Prediction',
                                  pred = 'Original Class Prediction')

cro(negative_set_table$pred, list(negative_set_table$corrected_pred, total()))
 Corrected Class Prediction     #Total 
 FALSE   TRUE   
 Original Class Prediction 
   FALSE  9028 702   9730
   TRUE  582 4825   5407
   #Total cases  9610 5527   15137
cat(paste0('\n', round(100*mean(negative_set_table$corrected_pred == negative_set_table$pred)),
           '% of the genes maintained their original predicted class'))
## 
## 92% of the genes maintained their original predicted class
rm(negative_set_table)

Probability and Gene Significance


The relation is not as strong as before in the highest scores

*The transparent version of the trend line is the original trend line

# Corrected score vs Gene Significance for the negative set; the transparent
# trend line is the pre-correction relation (fitted on the original prob)
negative_set %>% ggplot(aes(corrected_score, GS, color=MTcor)) + geom_point() + geom_smooth(method='gam', color='#666666') +
                 geom_line(stat='smooth', method='gam', color='#666666', alpha=0.5, size=1.2, aes(x=prob)) +
                 geom_hline(yintercept=mean(negative_set$GS), color='gray', linetype='dashed') +
                 scale_color_gradientn(colours=c('#F8766D','white','#00BFC4')) + xlab('Corrected Score') +
                 ggtitle('Relation between the Model\'s Corrected Score and Gene Significance') + theme_minimal()

Summarised version of score vs mean expression, plotting by module instead of by gene

The difference in the trend lines between this plot and the one above is that the one above takes all the points into consideration, while this one treats each module as a single observation; as a result, the top one is strongly affected by big modules, while the bottom one treats all modules the same.

The transparent version of each point and trend lines are the original values and trends before the bias correction

# One observation per module: mean/SD of the original and corrected scores,
# joined back to the module metadata
plot_data = negative_set %>% group_by(MTcor) %>% summarise(mean = mean(prob), sd = sd(prob),new_mean = mean(corrected_score),
                                                           new_sd = sd(corrected_score), n = n()) %>%
            mutate(MTcor_sign = ifelse(MTcor>0, 'Positive', 'Negative')) %>% left_join(original_dataset, by='MTcor') %>%
            dplyr::select(Module, MTcor, MTcor_sign, mean, new_mean, sd, new_sd, n) %>% distinct()
# Rename the Module column so plotly can use it as the hover id
colnames(plot_data)[1] = 'ID'

# Solid points/lines = corrected scores; transparent = original scores
ggplotly(plot_data %>% ggplot(aes(MTcor, new_mean, size=n, color=MTcor_sign)) + geom_point(aes(id = ID)) + 
         geom_smooth(method='loess', color='gray', se=FALSE) + geom_smooth(method='lm', se=FALSE) + 
         geom_point(aes(y=mean), alpha=0.3) + xlab('Module-Diagnosis correlation') + ylab('Mean Corrected Score by the Model') + 
         geom_line(stat='smooth', method='loess', color='gray', se=FALSE, alpha=0.3, size=1.2, aes(y=mean)) + 
         geom_line(stat='smooth', method='lm', se=FALSE, alpha=0.3, size=1.2, aes(y=mean)) + 
         theme_minimal() + theme(legend.position='none'))


Probability and mean level of expression


Check if correcting by gene also corrected by module: Yes, but not enough to remove the bias completely

# Per-gene mean and SD of expression
mean_and_sd = data.frame(ID=rownames(datExpr), meanExpr=rowMeans(datExpr), sdExpr=apply(datExpr,1,sd))

# NOTE(review): recovering IDs as rownames(test_set)[!test_set$SFARI] assumes
# filter(!SFARI) preserved test_set's row order — confirm
plot_data = negative_set %>% mutate(ID=rownames(test_set)[!test_set$SFARI]) %>% left_join(mean_and_sd, by='ID') %>% 
            left_join(original_dataset %>% mutate(ID=rownames(original_dataset)) %>% 
                      dplyr::select(ID, Module), by='ID')

# Aggregate expression and scores by module
plot_data2 = plot_data %>% group_by(Module) %>% summarise(meanExpr = mean(meanExpr), meanProb = mean(prob), 
                                                          new_meanProb = mean(corrected_score), n=n())

# Solid = corrected, transparent = original; points drawn in each module's colour
ggplotly(plot_data2 %>% ggplot(aes(meanExpr, new_meanProb, size=n)) + 
         geom_point(color=plot_data2$Module) + geom_point(color=plot_data2$Module, alpha=0.3, aes(y=meanProb)) + 
         geom_smooth(method='loess', se=TRUE, color='gray', alpha=0.1, size=0.7) + 
         geom_line(stat='smooth', method='loess', se=TRUE, color='gray', alpha=0.4, size=1.2, aes(y=meanProb)) +
         theme_minimal() + theme(legend.position='none') + xlab('Mean Expression') + ylab('Corrected Probability') +
         ggtitle('Mean expression vs corrected Model score by Module'))
rm(plot_data2, mean_and_sd)


Probability and SD of level of expression


The relation between SD and score became bigger than before

# SD of expression vs corrected score (solid line) and original score
# (transparent line); x axis on a square-root scale
plot_data %>% ggplot(aes(sdExpr, corrected_score)) + geom_point(alpha=0.1, color='#0099cc') + 
              geom_smooth(method='lm', color='#999999', se=FALSE, alpha=1) + xlab('SD') + ylab('Corrected Probability') +
              geom_line(stat='smooth', method='lm', color='#999999', se=FALSE, alpha=0.4, size=1.5, aes(y=prob)) + 
              theme_minimal() + ggtitle('SD vs model probability by gene') + scale_x_sqrt()

This fitted curve looks like the opposite of the trend found between mean/sd and model scores

# Mean vs SD of expression per gene (log-log), with loess and linear trends
plot_data %>% ggplot(aes(meanExpr, sdExpr)) + geom_point(alpha=0.1, color='#0099cc') + 
              geom_smooth(method='loess', color='gray', alpha=0.3) + 
              geom_smooth(method='lm', color='#999999', se=FALSE, alpha=1) + 
              scale_x_log10() + scale_y_log10() +  xlab('Mean Expression') + ylab('SD of Expression') +
              theme_minimal() + ggtitle('Mean expression vs SD by gene')


Probability and lfc


The relation seems to have gotten a bit stronger for the over-expressed genes and a bit weaker for the under-expressed genes

# Attach DE statistics (log fold change, adjusted p-value) to the negative set
plot_data = negative_set %>% mutate(ID=rownames(test_set)[!test_set$SFARI]) %>% 
            left_join(DE_info %>% mutate(ID=rownames(DE_info)), by='ID')

# LFC vs corrected score (solid) and original score (transparent)
plot_data %>% ggplot(aes(log2FoldChange, corrected_score)) + geom_point(alpha=0.1, color='#0099cc') + 
              geom_smooth(method='loess', color='gray', alpha=0.1) + xlab('LFC') + ylab('Corrected Probability') +
              geom_line(stat='smooth', method='loess', color='gray', alpha=0.4, size=1.5, aes(y=prob)) +
              theme_minimal() + ggtitle('LFC vs model probability by gene')

# Same relation split into under-expressed (left) and over-expressed (right)
# genes, coloured by DE significance (padj < 0.05)
p1 = plot_data %>% filter(log2FoldChange<0) %>% mutate(DE = padj<0.05) %>% ggplot(aes(log2FoldChange, corrected_score, color=DE)) + geom_point(alpha=0.1) + 
                   geom_smooth(method='loess', alpha=0.1) + xlab('') + ylab('Corrected Probability') + 
                   ylim(c(min(plot_data$corrected_score), max(plot_data$corrected_score))) + 
                   geom_line(stat='smooth', method='loess', alpha=0.4, size=1.5, aes(y=prob, color = DE)) +
                   theme_minimal() + theme(legend.position = 'none', plot.margin=unit(c(1,-0.3,1,1), 'cm'))

p2 = plot_data %>% filter(log2FoldChange>=0) %>% mutate(DE = padj<0.05) %>% ggplot(aes(log2FoldChange, corrected_score, color=DE)) + geom_point(alpha=0.1) + 
                   geom_smooth(method='loess', alpha=0.1) + xlab('') + ylab('Corrected Probability') + ylab('') +
                   scale_y_continuous(position = 'right', limits = c(min(plot_data$corrected_score), max(plot_data$corrected_score))) +
                   geom_line(stat='smooth', method = 'loess', alpha=0.4, size=1.5, aes(y = prob, color = DE)) +
                   theme_minimal() + theme(plot.margin = unit(c(1,1,1,-0.3), 'cm'), axis.ticks.y = element_blank())

grid.arrange(p1, p2, nrow=1, top = 'LFC vs model probability by gene', bottom = 'LFC')


Probability and Module-Diagnosis correlation


Not much change

# Corrected score vs module-diagnosis correlation; the transparent trend line
# is the pre-correction relation (fitted on the original prob)
module_score = negative_set %>% mutate(ID=rownames(test_set)[!test_set$SFARI]) %>%
               left_join(original_dataset %>% mutate(ID = rownames(original_dataset)), by='ID') %>%
               dplyr::select(ID, prob, corrected_score, Module, MTcor.x) %>% rename(MTcor = MTcor.x) %>% 
               left_join(data.frame(MTcor=unique(dataset$MTcor)) %>% arrange(by=MTcor) %>% 
                         mutate(order=1:length(unique(dataset$MTcor))), by='MTcor')

# Points drawn in module colours; alpha grows with corrected_score^4 so the
# highest-scoring genes stand out
ggplotly(module_score %>% ggplot(aes(MTcor, corrected_score)) + geom_point(color=module_score$Module, aes(id=ID, alpha=corrected_score^4)) +
         geom_hline(yintercept=mean(module_score$corrected_score), color='gray', linetype='dotted') + 
         geom_line(stat='smooth', method = 'loess', color='gray', alpha=0.5, size=1.5, aes(x=MTcor, y=prob)) +
         geom_smooth(color='gray', method = 'loess', se = FALSE, alpha=0.3) + theme_minimal() + 
         xlab('Module-Diagnosis correlation') + ylab('Corrected Score'))



Conclusion


This bias correction seems to be working but we may be losing a bit of biological signal on the way, mainly for under-expressed genes


Saving results

# Persist the bias-corrected test-set scores for downstream notebooks
write.csv(test_set, file='./../Data/BC_post_proc_model.csv', row.names = TRUE)




Session info

sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] caret_6.0-86       lattice_0.20-41    expss_0.10.2       ROCR_1.0-7        
##  [5] gplots_3.0.3       Rtsne_0.15         mgcv_1.8-31        nlme_3.1-147      
##  [9] biomaRt_2.40.5     RColorBrewer_1.1-2 gridExtra_2.3      viridis_0.5.1     
## [13] viridisLite_0.3.0  plotly_4.9.2       knitr_1.28         forcats_0.5.0     
## [17] stringr_1.4.0      dplyr_0.8.5        purrr_0.3.3        readr_1.3.1       
## [21] tidyr_1.0.2        tibble_3.0.0       ggplot2_3.3.0      tidyverse_1.3.0   
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.3.1                backports_1.1.5            
##   [3] Hmisc_4.4-0                 plyr_1.8.6                 
##   [5] lazyeval_0.2.2              splines_3.6.3              
##   [7] crosstalk_1.1.0.1           BiocParallel_1.18.1        
##   [9] GenomeInfoDb_1.20.0         digest_0.6.25              
##  [11] foreach_1.5.0               htmltools_0.4.0            
##  [13] gdata_2.18.0                fansi_0.4.1                
##  [15] magrittr_1.5                checkmate_2.0.0            
##  [17] memoise_1.1.0               cluster_2.1.0              
##  [19] recipes_0.1.10              annotate_1.62.0            
##  [21] modelr_0.1.6                gower_0.2.1                
##  [23] matrixStats_0.56.0          prettyunits_1.1.1          
##  [25] jpeg_0.1-8.1                colorspace_1.4-1           
##  [27] blob_1.2.1                  rvest_0.3.5                
##  [29] haven_2.2.0                 xfun_0.12                  
##  [31] crayon_1.3.4                RCurl_1.98-1.1             
##  [33] jsonlite_1.6.1              genefilter_1.66.0          
##  [35] survival_3.1-11             iterators_1.0.12           
##  [37] glue_1.3.2                  gtable_0.3.0               
##  [39] ipred_0.9-9                 zlibbioc_1.30.0            
##  [41] XVector_0.24.0              DelayedArray_0.10.0        
##  [43] shape_1.4.4                 BiocGenerics_0.30.0        
##  [45] scales_1.1.0                DBI_1.1.0                  
##  [47] Rcpp_1.0.4                  xtable_1.8-4               
##  [49] progress_1.2.2              htmlTable_1.13.3           
##  [51] foreign_0.8-75              bit_1.1-15.2               
##  [53] Formula_1.2-3               stats4_3.6.3               
##  [55] lava_1.6.7                  prodlim_2019.11.13         
##  [57] glmnet_3.0-2                htmlwidgets_1.5.1          
##  [59] httr_1.4.1                  acepack_1.4.1              
##  [61] ellipsis_0.3.0              farver_2.0.3               
##  [63] pkgconfig_2.0.3             XML_3.99-0.3               
##  [65] nnet_7.3-13                 dbplyr_1.4.2               
##  [67] locfit_1.5-9.4              labeling_0.3               
##  [69] tidyselect_1.0.0            rlang_0.4.5                
##  [71] reshape2_1.4.3              AnnotationDbi_1.46.1       
##  [73] munsell_0.5.0               cellranger_1.1.0           
##  [75] tools_3.6.3                 cli_2.0.2                  
##  [77] generics_0.0.2              RSQLite_2.2.0              
##  [79] broom_0.5.5                 evaluate_0.14              
##  [81] yaml_2.2.1                  ModelMetrics_1.2.2.2       
##  [83] bit64_0.9-7                 fs_1.4.0                   
##  [85] caTools_1.18.0              xml2_1.2.5                 
##  [87] compiler_3.6.3              rstudioapi_0.11            
##  [89] curl_4.3                    png_0.1-7                  
##  [91] e1071_1.7-3                 reprex_0.3.0               
##  [93] geneplotter_1.62.0          stringi_1.4.6              
##  [95] highr_0.8                   Matrix_1.2-18              
##  [97] vctrs_0.2.4                 pillar_1.4.3               
##  [99] lifecycle_0.2.0             data.table_1.12.8          
## [101] bitops_1.0-6                GenomicRanges_1.36.1       
## [103] R6_2.4.1                    latticeExtra_0.6-29        
## [105] KernSmooth_2.23-16          IRanges_2.18.3             
## [107] codetools_0.2-16            MASS_7.3-51.5              
## [109] gtools_3.8.2                assertthat_0.2.1           
## [111] SummarizedExperiment_1.14.1 DESeq2_1.24.0              
## [113] withr_2.1.2                 S4Vectors_0.22.1           
## [115] GenomeInfoDbData_1.2.1      parallel_3.6.3             
## [117] hms_0.5.3                   grid_3.6.3                 
## [119] rpart_4.1-15                timeDate_3043.102          
## [121] class_7.3-16                rmarkdown_2.1              
## [123] pROC_1.16.2                 Biobase_2.44.0             
## [125] lubridate_1.7.4             base64enc_0.1-3